{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b1533081",
   "metadata": {},
   "source": [
    "## Notebook 6 - Adding negative enzyme-substrate combinations"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "508eb0a8",
   "metadata": {},
   "source": [
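    "This notebook generates, for each cosine-score value (0.75, 0.8, 0.85, 0.9), a CSV file listing all enzyme-substrate combinations and whether each one was productive. Combinations that are absent from the screening results are added as negatives with `AUC = 0`, and the `AUC_binary` column is 1 when `AUC > 0` and 0 otherwise."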
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff885529",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Execute the shared project script inside this notebook's namespace.\n",
    "# NOTE(review): later cells use `pd`, `filepath_results` and `enzymes_inclusion`, none of\n",
    "# which is defined in this notebook -- presumably they come from common.py; confirm.\n",
    "%run ../common.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f69760d8",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/vblay/micromamba/envs/project_gt/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import umap\n",
    "from sklearn.cluster import KMeans\n",
    "from rdkit.Chem import AllChem, MolFromSmiles\n",
    "from matplotlib import colormaps"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "5e2853ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Substrate table; the next cell reads its `Name` and `CSMILES` columns to enumerate\n",
    "# substrates and to look up the CSMILES string of each substrate name.\n",
    "df_substrates = pd.read_csv(filepath_results + 'Substrates_VB_clean.csv', encoding='utf8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "c10e3b51",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build, for every cosine-score value that appears in the screening-result file names,\n",
    "# one table with all enzyme-substrate pairs: pairs found in the screening results keep\n",
    "# their AUC, every other pair of the enzyme x substrate grid is added with AUC = 0.\n",
    "for i in [0.75, 0.8, 0.85, 0.9]:\n",
    "\n",
    "    # NOTE: read_pickle can execute arbitrary code; only load files produced by this project.\n",
    "    df = pd.read_pickle(filepath_results + f\"Screening_results_CosineScore_{i}.pkl\")\n",
    "    assert df.CSMILES.isin(df_substrates.CSMILES).all(), 'screening results contain CSMILES missing from the substrate table'\n",
    "\n",
    "    # Name -> CSMILES lookup (if a name occurs more than once, the last row wins).\n",
    "    csmiles_map = dict(zip(df_substrates['Name'], df_substrates['CSMILES']))\n",
    "    substrates = df_substrates['Name'].unique()\n",
    "\n",
    "    # (enzyme, substrate) pairs present in the screening results. Pairs are kept as tuples\n",
    "    # instead of being joined into 'enzyme_name' strings: splitting such a string back apart\n",
    "    # assigns the pieces wrongly whenever a substrate name itself contains '_'.\n",
    "    observed_pairs = set(zip(df['Enzyme_name'], df['Name']))\n",
    "\n",
    "    # Every grid pair that was not observed becomes a negative. Walking the grid in order\n",
    "    # (rather than iterating a set difference) keeps the row order stable between runs;\n",
    "    # dict.fromkeys drops repeated enzymes while preserving their order.\n",
    "    negative_rows = [\n",
    "        (enzyme, name, csmiles_map[name], 0.)\n",
    "        for enzyme in dict.fromkeys(enzymes_inclusion)\n",
    "        for name in substrates\n",
    "        if (enzyme, name) not in observed_pairs\n",
    "    ]\n",
    "    df_negatives = pd.DataFrame(negative_rows, columns=['Enzyme_name', 'Name', 'CSMILES', 'AUC'])\n",
    "    # Keep AUC numeric even when there are no negatives at all.\n",
    "    df_negatives = df_negatives.astype({'AUC': float})\n",
    "\n",
    "    # One row per (substrate, enzyme) pair; the first occurrence is kept.\n",
    "    df = df.drop_duplicates(subset=['Name', 'Enzyme_name'], keep='first')\n",
    "    # Larger of the single and double AUC. The column-wise max skips NaN, so a missing value\n",
    "    # in one column does not hide a valid value in the other.\n",
    "    df = df.assign(AUC=df[['AUC_single', 'AUC_double']].max(axis=1))\n",
    "    df = df[['Enzyme_name', 'Name', 'CSMILES', 'AUC']]\n",
    "    df = pd.concat([df, df_negatives], ignore_index=True)\n",
    "    # 1 for productive pairs (AUC > 0), 0 otherwise.\n",
    "    df['AUC_binary'] = (df['AUC'] > 0).astype(int)\n",
    "\n",
    "    df.to_csv(filepath_results + f'/All_singlesordoubles_{i}.csv', index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c59f9ecd",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "project_gt",
   "language": "python",
   "name": "project_gt"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.14"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
